Startup

Get reference alleles

reference_alleles <- read_tsv("GCh38_reference_alleles.txt", col_names = F) %>% pull(1)
Rows: 26 Columns: 1
── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): X1

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
genome_region_reference <- get_allele_length(reference_alleles)

Get arcasHLA alignment stats

arcas_log_dir <- sprintf("%s/arcasHLA", isb_path)
alignment_stats_df <- hla_mapping_stats_import(isb_samples, arcas_log_dir)

Import cell counts

cell_counts_df <- readRDS("isb_sequenced_cell_count.RDS")

Assemble accuracy DF

success_df <- readRDS("isb_success.RDS")
accuracy_df <- readRDS("isb_accuracy_drb345_filtered.RDS")
accuracy_df <- accuracy_df %>% 
  left_join(
    success_df %>% select(sample:field, genotyper, success = accuracy),
    by = c("sample", "locus", "field", "genotyper")
  )%>% 
  left_join(alignment_stats_df, by = c("sample", "locus")) %>% 
  left_join(genome_region_reference, by = "locus") %>%
  left_join(cell_counts_df, by = "sample") %>%
  mutate_at(vars(c(reads, bp)), as.numeric) %>% 
  mutate(coverage = (reads * 91)/bp) %>% 
  mutate(n_alleles = map_dbl(allele, function(x) sum(!is.na(x)))) %>% 
  filter(grepl("^[ABC]|D.[AB]1", locus) & grepl("-(AC|BL)$", sample) & genotyper != "hlaminer") 

Coverage based plots

gg_coverage(accuracy_df, x_var = "coverage", y_var = "accuracy") 
`geom_smooth()` using formula 'y ~ x'

gg_coverage(accuracy_df, x_var = "coverage", y_var = "success")
`geom_smooth()` using formula 'y ~ x'

gg_coverage(accuracy_df, x_var = "coverage", y_var = "n_alleles", facet_genotyper = T)

Cell number based plots

gg_coverage(accuracy_df, x_var = "n_cells", y_var = "accuracy") 
`geom_smooth()` using formula 'y ~ x'

gg_coverage(accuracy_df, x_var = "n_cells", y_var = "success")
`geom_smooth()` using formula 'y ~ x'

gg_coverage(accuracy_df, x_var = "n_cells", y_var = "n_alleles", facet_genotyper = T)

Subsample READS

Import read subsample data

# Import predicted genotypes
subsample_reads_path <- "/labs/khatrilab/solomonb/covid/isb/subsample"
subsample_reads_samples <- tibble(sample = list.files(sprintf("%s/arcasHLA", subsample_reads_path))) %>% 
  filter(grepl("^INCOV", sample)) %>% 
  separate(sample, into = c("sample"), sep = "\\.", extra = "drop") %>% 
  distinct() %>% 
  pull(sample)
subsample_reads_df <- format_hla_table(combine_HLA_import(path = subsample_reads_path, samples = subsample_reads_samples))
subsample_reads_df <- subsample_reads_df %>% 
  filter(grepl("^[ABC]|D.[AB]1", locus) & grepl("-(AC|BL)_", sample))
subsample_reads_df
# Expand invitro genotypes across read subsample names
invitro_df <- invitro_import()
Rows: 1557 Columns: 9
── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (8): Sample ID, Locus, Allele 1, Allele 2, Comments, Diploid Ambiguities, Allele 1 Ambiguities, Allele 2 Ambiguities
dbl (1): index

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
invitro_reads_df <- subsample_reads_df %>% 
  select(sample) %>% 
  distinct() %>% 
  separate(sample, into = c("sample", "subsample"), sep = "_") %>% 
  left_join(invitro_df, by = "sample") %>% 
  unite(sample, sample, subsample, sep = "_") %>% 
  format_hla_table() %>% 
  drop_na()
invitro_reads_df
# Import read statistics
subsample_reads_arcas_log_dir <- sprintf("%s/arcasHLA", subsample_reads_path)
subsample_reads_alignment_stats_df <- hla_mapping_stats_import(subsample_reads_samples, subsample_reads_arcas_log_dir)

Calculate accuracy

# Calculate accuracy
subsample_reads_accuracy_df <- compare_hla(
  hla_df = bind_rows(
    subsample_reads_df,
    invitro_reads_df
  ),
  reference = "invitro", 
  method = "accuracy"
)
# saveRDS(subsample_reads_accuracy_df, "isb_subsample_reads_accuracy.RDS")
# Calculate success
subsample_reads_success_df <- compare_hla(
  hla_df = bind_rows(
    subsample_reads_df,
    invitro_reads_df
  ),
  reference = "invitro", 
  method = "success"
)
# saveRDS(subsample_reads_success_df, "isb_subsample_reads_success.RDS")
# Combine various data
subsample_reads_combined_df <- subsample_reads_accuracy_df %>% 
  left_join(
    subsample_reads_success_df %>% select(sample:field, genotyper, success = accuracy),
    by = c("sample", "locus", "field", "genotyper")
  )%>% 
  left_join(subsample_reads_alignment_stats_df, by = c("sample", "locus")) %>% 
  left_join(genome_region_reference, by = "locus") %>%
  bind_rows(accuracy_df %>% select(-n_cells)) %>% # No cell numbers for read subsamples
  mutate_at(vars(c(reads, bp)), as.numeric) %>% 
  mutate(coverage = (reads * 91)/bp) %>%  # Calculate coverage
  mutate(n_alleles = map_dbl(allele, function(x) sum(!is.na(x)))) %>%  # Calculate n_alleles
  filter(grepl("^[ABC]|D.[AB]1", locus) & grepl("-(AC|BL)", sample) & genotyper != "hlaminer") 

Reads plots

gg_coverage(subsample_reads_combined_df, x_var = "reads", y_var = "accuracy") 
`geom_smooth()` using formula 'y ~ x'

plt_read_accuracy_all <- gg_coverage(subsample_reads_combined_df, x_var = "reads", y_var = "accuracy") 
plt_read_accuracy_abc <- gg_coverage(subsample_reads_combined_df %>% filter(grepl("^[ABC]", locus)), x_var = "reads", y_var = "accuracy") 
gg_coverage(subsample_reads_combined_df, x_var = "reads", y_var = "success")
`geom_smooth()` using formula 'y ~ x'

plt_read_success_all <- gg_coverage(subsample_reads_combined_df, x_var = "reads", y_var = "success")
plt_read_success_abc <- gg_coverage(subsample_reads_combined_df %>% filter(grepl("^[ABC]", locus)), x_var = "reads", y_var = "success") 
gg_coverage(subsample_reads_combined_df, x_var = "reads", y_var = "n_alleles", facet_genotyper = T)

plt_read_allele_all <- gg_coverage(subsample_reads_combined_df, x_var = "reads", y_var = "n_alleles", facet_genotyper = T)
plt_read_allele_abc <- gg_coverage(subsample_reads_combined_df %>% filter(grepl("^[ABC]", locus)), x_var = "reads", y_var = "n_alleles", facet_genotyper = T)
  • Default arcasHLA cut off 75 reads (log10 = 1.8)
subsample_reads_combined_df %>% 
  filter(field == "field_2") %>% 
  mutate(n_alleles = factor(n_alleles, levels = 0:2)) %>% 
  ggplot(aes(y = n_alleles, x = log10(coverage)))+
  # geom_violin(scale = "count", alpha = 0.5, )+
  geom_jitter(size = 0.2, width = 0, height =0.2, alpha = 0.2)+
  # geom_boxplot(width=0.1)+
  # ggbeeswarm::geom_beeswarm(size = 0.1, alpha = 0.1)+
  # coord_flip()+
  facet_grid(genotyper ~ locus, scales = "free")+
  theme_bw()

Subsample CELLS

# Import predicted genotypes
subsample_cells_path <- "/labs/khatrilab/solomonb/covid/isb/subsample_cells"
subsample_cells_samples <- tibble(sample = list.files(sprintf("%s/arcasHLA", subsample_cells_path))) %>% 
  filter(grepl("^INCOV", sample)) %>% 
  separate(sample, into = c("sample"), sep = "\\.", extra = "drop") %>% 
  distinct() %>% 
  pull(sample)
subsample_cells_df <- format_hla_table(combine_HLA_import(path = subsample_cells_path, samples = subsample_cells_samples))
subsample_cells_df <- subsample_cells_df %>% 
  filter(grepl("^[ABC]|D.[AB]1", locus) & grepl("-(AC|BL)_", sample))
subsample_cells_df
# Expand invitro genotypes across read subsample names
invitro_df <- invitro_import()
Rows: 1557 Columns: 9
── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (8): Sample ID, Locus, Allele 1, Allele 2, Comments, Diploid Ambiguities, Allele 1 Ambiguities, Allele 2 Ambiguities
dbl (1): index

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
invitro_cells_df <- subsample_cells_df %>% 
  select(sample) %>% 
  distinct() %>% 
  separate(sample, into = c("sample", "subsample"), sep = "_") %>% 
  left_join(invitro_df, by = "sample") %>% 
  unite(sample, sample, subsample, sep = "_") %>% 
  format_hla_table() %>% 
  drop_na()
invitro_cells_df
# Import read statistics
subsample_cells_arcas_log_dir <- sprintf("%s/arcasHLA", subsample_cells_path)
subsample_cells_alignment_stats_df <- hla_mapping_stats_import(subsample_cells_samples, subsample_cells_arcas_log_dir)
# Import cell counts for each subsample
subsample_cell_counts_df <- tibble(line = system("wc -l /labs/khatrilab/solomonb/covid/isb/subsample_cells/barcodes/INCOV*", intern = T)) %>% 
  separate(line, into = c("n_cells", "file"), sep = " /") %>% 
  drop_na() %>% 
  mutate(sample = gsub("\\..*","",basename(file))) %>% 
  mutate(n_cells = as.numeric(n_cells)) %>% 
  select(-file) %>% 
  # separate(sample, into = c("sample", "subset"), sep = "_") %>% 
  bind_rows(cell_counts_df)
Warning: Expected 2 pieces. Missing pieces filled with `NA` in 1 rows [883].
subsample_cell_counts_df

Calculate accuracy

# Calculate accuracy
subsample_cells_accuracy_df <- compare_hla(
  hla_df = bind_rows(
    subsample_cells_df,
    invitro_cells_df
  ),
  reference = "invitro", 
  method = "accuracy"
)
# saveRDS(subsample_cells_accuracy_df, "isb_subsample_cells_accuracy.RDS")
# subsample_cells_accuracy_df <- readRDS("isb_subsample_cells_accuracy.RDS")
# Calculate success
subsample_cells_success_df <- compare_hla(
  hla_df = bind_rows(
    subsample_cells_df,
    invitro_cells_df
  ),
  reference = "invitro", 
  method = "success"
)
# saveRDS(subsample_cells_success_df, "isb_subsample_cells_success.RDS")
subsample_cells_success_df <- readRDS("isb_subsample_cells_success.RDS")
# Combine various data
subsample_cells_combined_df <- subsample_cells_accuracy_df %>% 
  left_join(
    subsample_cells_success_df %>% select(sample:field, genotyper, success = accuracy),
    by = c("sample", "locus", "field", "genotyper")
  )%>% 
  left_join(subsample_cells_alignment_stats_df, by = c("sample", "locus")) %>% 
  left_join(genome_region_reference, by = "locus") %>%
  left_join(subsample_cell_counts_df, by = "sample") %>%
  bind_rows(accuracy_df) %>% 
  mutate_at(vars(c(reads, bp)), as.numeric) %>% 
  mutate(coverage = (reads * 91)/bp) %>% 
  mutate(n_alleles = map_dbl(allele, function(x) sum(!is.na(x)))) %>% 
  filter(grepl("^[ABC]|D.[AB]1", locus) & grepl("-(AC|BL)", sample) & genotyper != "hlaminer") 

Coverage plots

gg_coverage(subsample_cells_combined_df, x_var = "coverage", y_var = "accuracy") 
`geom_smooth()` using formula 'y ~ x'

gg_coverage(subsample_cells_combined_df, x_var = "coverage", y_var = "success")
`geom_smooth()` using formula 'y ~ x'

gg_coverage(subsample_cells_combined_df, x_var = "coverage", y_var = "n_alleles", facet_genotyper = T)

Cell number plots

gg_coverage(subsample_cells_combined_df, x_var = "n_cells", y_var = "accuracy") 
`geom_smooth()` using formula 'y ~ x'

plt_cell_accuracy_all <- gg_coverage(subsample_cells_combined_df, x_var = "n_cells", y_var = "accuracy") 
plt_cell_accuracy_abc <- gg_coverage(subsample_cells_combined_df %>% filter(grepl("^[ABC]", locus)), x_var = "n_cells", y_var = "accuracy") 
gg_coverage(subsample_cells_combined_df, x_var = "n_cells", y_var = "success") 
`geom_smooth()` using formula 'y ~ x'

plt_cell_success_all <- gg_coverage(subsample_cells_combined_df, x_var = "n_cells", y_var = "success") 
plt_cell_success_abc <- gg_coverage(subsample_cells_combined_df %>% filter(grepl("^[ABC]", locus)), x_var = "n_cells", y_var = "success") 
gg_coverage(subsample_cells_combined_df, x_var = "n_cells", y_var = "n_alleles", facet_genotyper = T)

plt_cell_allele_all <- gg_coverage(subsample_cells_combined_df, x_var = "n_cells", y_var = "n_alleles", facet_genotyper = T)
plt_cell_allele_abc <- gg_coverage(subsample_cells_combined_df %>% filter(grepl("^[ABC]", locus)), x_var = "n_cells", y_var = "n_alleles", facet_genotyper = T)
# subsample_cells_combined_df %>% 
#   mutate(reads = reads/n_cells) %>% 
#   filter(reads < 100) %>% 
#   gg_coverage(x_var = "reads", y_var = "accuracy", x_log = F) 

Combined plots

no_leg <- list(theme(legend.position = "none"))
legend <- cowplot::get_legend(plt_read_accuracy_abc)
`geom_smooth()` using formula 'y ~ x'
plot_grid(
  plot_grid(
    plt_read_accuracy_abc + no_leg,
    plt_read_success_abc + no_leg,
    plt_read_allele_abc,
    ncol = 1,
    align = "v",
    axis = "lr",
    rel_heights = c(1,1,1.2),
    labels = LETTERS[1:3]
  ),
  plot_grid(
    plt_cell_accuracy_abc + no_leg,
    plt_cell_success_abc + no_leg,
    plt_cell_allele_abc,
    ncol = 1,
    align = "v",
    axis = "lr",
    rel_heights = c(1,1,1.2),
    labels = LETTERS[4:6]
  ),
  plot_grid(
    legend,
    legend,
    NA,
    ncol = 1
  ),
  nrow = 1,
  rel_widths = c(1,1,0.25)
)
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'

no_leg <- list(theme(legend.position = "none"))
legend <- cowplot::get_legend(plt_read_accuracy_all)
`geom_smooth()` using formula 'y ~ x'
plot_grid(
  plot_grid(
    plt_read_accuracy_all + no_leg,
    plt_read_success_all + no_leg,
    plt_read_allele_all,
    ncol = 1,
    align = "v",
    axis = "lr",
    rel_heights = c(1,1,1.2),
    labels = LETTERS[1:3]
  ),
  plot_grid(
    plt_cell_accuracy_all + no_leg,
    plt_cell_success_all + no_leg,
    plt_cell_allele_all,
    ncol = 1,
    align = "v",
    axis = "lr",
    rel_heights = c(1,1,1.2),
    labels = LETTERS[4:6]
  ),
  plot_grid(
    legend,
    legend,
    NA,
    ncol = 1
  ),
  nrow = 1,
  rel_widths = c(1,1,0.25)
)
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'

Main plot

loci <- c("A", "B", "C", "DPB1", "DQA1", "DRB1")
plt_read_accuracy_subset <- gg_coverage(
  subsample_reads_combined_df %>% 
    filter(locus %in% loci) %>% 
    filter(!(genotyper == "phlat" & locus == "DPB1")) %>% 
    filter(!(genotyper == "optitype" & locus == "DPB1")) %>% 
    filter(!(genotyper == "optitype" & locus == "DQA1")) %>% 
    filter(!(genotyper == "optitype" & locus == "DRB1")), 
  x_var = "reads", y_var = "accuracy") +
  facet_wrap(~locus, nrow = 2)
plt_read_accuracy_subset
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 59 rows containing non-finite values (stat_smooth).
Warning: Removed 59 rows containing missing values (geom_point).

save_plot("figures_pdf/v2/5_coverage.pdf", plt_read_accuracy_subset, base_height = 4, base_width = 6)
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 59 rows containing non-finite values (stat_smooth).
Warning: Removed 59 rows containing missing values (geom_point).
plt_read_allele_all <- gg_coverage(subsample_reads_combined_df %>% 
    filter(locus %in% loci) %>% 
    filter(!(genotyper == "phlat" & locus == "DPB1")) %>% 
    filter(!(genotyper == "optitype" & locus == "DPB1")) %>% 
    filter(!(genotyper == "optitype" & locus == "DQA1")) %>% 
    filter(!(genotyper == "optitype" & locus == "DRB1")), x_var = "reads", y_var = "n_alleles", facet_genotyper = T)
plt_read_allele_all
Warning: Removed 62 rows containing missing values (geom_point).

plt_main <- plot_grid(
  plt_read_accuracy_subset,
  plt_read_allele_all,
  ncol = 1,
  rel_heights = c(3,2),
  align = "v",
  axis = "lr",
  labels = c("A", "B")
)
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 59 rows containing non-finite values (stat_smooth).
Warning: Removed 59 rows containing missing values (geom_point).
Warning: Removed 62 rows containing missing values (geom_point).
plt_main

save_plot("figures_pdf/v2/5_coverage.pdf", plt_main, base_height = 7, base_width = 7)

Slope stats

  # browser()
  df <- df %>% 
    filter(field == field_var) %>% 
    exclude_genotyper_fields() # Replace w/ exclude_genotyper_fields?
Error in UseMethod("filter") : 
  no applicable method for 'filter' applied to an object of class "function"
model_stats <- bind_rows(
  get_lm_stats(subsample_reads_combined_df, x_var = "reads", y_var = "accuracy", x_log = T) %>% mutate(x_var = "reads", y_var = "accuracy"),
  get_lm_stats(subsample_reads_combined_df, x_var = "reads", y_var = "success", x_log = T) %>% mutate(x_var = "reads", y_var = "success"),
  get_lm_stats(subsample_cells_combined_df, x_var = "n_cells", y_var = "accuracy", x_log = T) %>% mutate(x_var = "cells", y_var = "accuracy"),
  get_lm_stats(subsample_cells_combined_df, x_var = "n_cells", y_var = "success", x_log = T) %>% mutate(x_var = "cells", y_var = "success")
)  %>% 
  # filter(grepl("^[ABC]", locus)) %>% 
  ungroup() %>% 
  filter(field == "field_2") 

Table

col_vars <- c("x_var", "locus")
row_vars <- c("y_var", "genotyper")

flex_df <- model_stats %>% 
  # filter(grepl("^[ABC]", locus)) %>% 
  ungroup() %>% 
  filter(field == "field_2") %>% 
  mutate(CI = sprintf("(%s) - (%s)", est_min, est_max)) %>% 
  mutate(genotyper = reformat_hla_genotyper(genotyper)) %>% 
  select(genotyper, locus, CI, x_var, y_var) %>% 
  pivot_wider(names_from = col_vars, values_from = "CI", names_sep =  "-") 
Note: Using an external vector in selections is ambiguous.
ℹ Use `all_of(col_vars)` instead of `col_vars` to silence this message.
ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
This message is displayed once per session.
df_key <- tibble(col_keys = names(flex_df)) %>% 
  filter(!(col_keys %in% c(col_vars, row_vars))) %>% 
  separate(col_keys, into = col_vars, sep = "-", remove = F) %>%
  mutate_at(vars(contains("genotyper")), reformat_hla_genotyper) %>% 
  arrange_at(col_vars) %>% 
  mutate_all(as.character) %>% 
  mutate_at(col_vars[!(col_vars %in% "locus")], str_to_sentence)

flex_df %>% 
  select(row_vars, everything()) %>% 
  mutate_at(row_vars[!(row_vars %in% "genotyper")], str_to_sentence) %>% 
  # select(locus, df_key$col_keys) %>% 
  flextable() %>% 
  colformat_char(na_str = "---") %>%
  merge_v(j=1:2) %>% 
  set_header_df(mapping = df_key, key = "col_keys") %>% 
  merge_h(part = "header") %>%
  theme_vanilla() %>% 
  # vline(j=vlines_sequence, border = fp_border_default()) %>%
  fix_border_issues() %>% 
  autofit() %>% 
  fontsize(size = 8, part = "all") %>% 
  print(preview = "pptx")
Note: Using an external vector in selections is ambiguous.
ℹ Use `all_of(row_vars)` instead of `row_vars` to silence this message.
ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
This message is displayed once per session.
  df <- df %>% 
    mutate(field = reformat_hla_field(field)) %>% 
    mutate_at(vars(contains("genotyper")), reformat_hla_genotyper) %>% 
    mutate(cell_value = ifelse(!is.na(se), 
                               sprintf("%s %s %s", round(mean_accuracy,2),"\u00B1",round(se,2)),
                               sprintf("%s", round(mean_accuracy,2)))) %>%
    mutate(cell_value = ifelse(grepl("NA", cell_value), NA, cell_value)) %>% 
    select(-sd,-se, -mean_accuracy) %>% 
    pivot_wider(names_from = nesting_vars, values_from = "cell_value", names_sep = "-") 
Error: Problem with `mutate()` column `field`.
ℹ `field = reformat_hla_field(field)`.
x object 'field' not found
Run `rlang::last_error()` to see where the error occurred.

Scratch work

subsample_cells_combined_df %>% 
  filter(field == "field_2") %>% 
  mutate(n_alleles = factor(n_alleles, levels = 0:2)) %>% 
  ggplot(aes(x = n_alleles, y = log10(n_cells)))+
  geom_violin(scale = "width")+
  geom_jitter(size = 0.2, height = 0, width =0.05, alpha = 0.2)+
  # geom_boxplot(width=0.1)+
  # ggbeeswarm::geom_beeswarm(size = 0.1, alpha = 0.1)+
  coord_flip()+
  facet_grid(genotyper ~ locus, scales = "free")+
  theme_bw()

Coverage stats

subsample_cells_combined_df %>% 
  ggplot(aes(x = n_cells, y = coverage, color = genotyper)) + 
  geom_point() +
  facet_wrap(genotyper ~ locus, scales = "free", nrow = 3) + 
  theme_bw() +
  theme(strip.text = element_blank(),
        axis.text.x = element_text(angle = 45, hjust = 1))+
  scale_color_brewer(palette = "Dark2")
subsample_cells_combined_df %>% 
  select(-genotyper) %>% distinct() %>% 
  mutate(cov_per_cell = coverage/n_cells) %>% 
  ggplot(aes(x = log10(n_cells), y = cov_per_cell)) + 
  geom_point(color = "black", alpha = 0.1) +
  stat_smooth(method = "lm")+
  # stat_smooth()+
    facet_wrap( ~ locus, scales = "free", nrow = 1) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))+
  scale_color_brewer(palette = "Dark2")
subsample_cells_combined_df %>% 
  select(-genotyper) %>% distinct() %>% 
  filter(!is.na(n_cells)) %>% 
  mutate(cov_per_cell = coverage/n_cells) %>% 
  mutate(n_cells = cut(n_cells, seq(0,10000,by=2500), right = F)) %>% 
  ggplot(aes(x = n_cells, y = cov_per_cell)) + 
  stat_summary(fun = function(x) mean(x,na.rm=T), geom = "bar") +
  # geom_point(color = "black", alpha = 0.1) +
  # stat_smooth(method = "lm")+
  # stat_smooth()+
  facet_wrap( ~ locus, scales = "free", nrow = 1) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))+
  scale_color_brewer(palette = "Dark2")
hist(cut(1:100, seq(1,100,10)))
subsample_cells_combined_df %>% 
  ungroup() %>% 
  count(genotyper, locus, field)
subsample_reads_combined_df %>% 
  ungroup() %>% 
  count(genotyper, locus, field)
---
title: "3) Coverage Analysis"
output: 
  html_notebook:
    toc: true
    toc_depth: 3
    toc_float: true
    code_folding: hide
---


# Startup 

```{r, message = F}
library(tidyverse)
library(cowplot)
library(ggpmisc)

source("data_import_functions.R")
source("calculation_functions.R")
source("figure_format_functions.R")
all_hla_expanded <- readRDS("all_hla_expanded.RDS")
isb_path <- "/labs/khatrilab/solomonb/covid/isb"
isb_samples <- read_tsv("/labs/khatrilab/solomonb/covid/isb/logs/210217_232725/parallel.log") %>% 
  separate(Command, into = c(NA, "sample"), sep = " ") %>% 
  pull(sample) %>% unique()
```

### Get reference alleles

- GCh38 reference alleles (per https://www.ebi.ac.uk/ipd/imgt/hla/help/genomics.html)
- To be used to calculate coverage

```{r, message = F}
reference_alleles <- read_tsv("GCh38_reference_alleles.txt", col_names = F) %>% pull(1)
genome_region_reference <- get_allele_length(reference_alleles)
```

### Get arcasHLA alignment stats

```{r}
arcas_log_dir <- sprintf("%s/arcasHLA", isb_path)
alignment_stats_df <- hla_mapping_stats_import(isb_samples, arcas_log_dir)
```

### Import cell counts

```{r}
cell_counts_df <- readRDS("isb_sequenced_cell_count.RDS")
```

### Assemble accuracy DF

```{r}
success_df <- readRDS("isb_success.RDS")
accuracy_df <- readRDS("isb_accuracy_drb345_filtered.RDS")
accuracy_df <- accuracy_df %>% 
  left_join(
    success_df %>% select(sample:field, genotyper, success = accuracy),
    by = c("sample", "locus", "field", "genotyper")
  )%>% 
  left_join(alignment_stats_df, by = c("sample", "locus")) %>% 
  left_join(genome_region_reference, by = "locus") %>%
  left_join(cell_counts_df, by = "sample") %>%
  mutate_at(vars(c(reads, bp)), as.numeric) %>% 
  mutate(coverage = (reads * 91)/bp) %>% 
  mutate(n_alleles = map_dbl(allele, function(x) sum(!is.na(x)))) %>% 
  filter(grepl("^[ABC]|D.[AB]1", locus) & grepl("-(AC|BL)$", sample) & genotyper != "hlaminer") 
```


### Coverage based plots

```{r, fig.width=12, fig.height=2.5, warning = F, message = F}
gg_coverage(accuracy_df, x_var = "coverage", y_var = "accuracy") 
```
```{r, fig.width=12, fig.height=2.5, warning = F, message = F}
gg_coverage(accuracy_df, x_var = "coverage", y_var = "success")
```
```{r, fig.width=12, fig.height=3, warning = F, message = F}
gg_coverage(accuracy_df, x_var = "coverage", y_var = "n_alleles", facet_genotyper = T)
```

### Cell number based plots

```{r, fig.width=12, fig.height=2.5, warning = F, message = F}
gg_coverage(accuracy_df, x_var = "n_cells", y_var = "accuracy") 
```
```{r, fig.width=12, fig.height=2.5, warning = F, message = F}
gg_coverage(accuracy_df, x_var = "n_cells", y_var = "success")
```
```{r, fig.width=12, fig.height=3, warning = F, message = F}
gg_coverage(accuracy_df, x_var = "n_cells", y_var = "n_alleles", facet_genotyper = T)
```


# Subsample READS

### Import read subsample data

```{r}
# Import predicted genotypes
subsample_reads_path <- "/labs/khatrilab/solomonb/covid/isb/subsample"
subsample_reads_samples <- tibble(sample = list.files(sprintf("%s/arcasHLA", subsample_reads_path))) %>% 
  filter(grepl("^INCOV", sample)) %>% 
  separate(sample, into = c("sample"), sep = "\\.", extra = "drop") %>% 
  distinct() %>% 
  pull(sample)
subsample_reads_df <- format_hla_table(combine_HLA_import(path = subsample_reads_path, samples = subsample_reads_samples))
subsample_reads_df <- subsample_reads_df %>% 
  filter(grepl("^[ABC]|D.[AB]1", locus) & grepl("-(AC|BL)_", sample))
subsample_reads_df
```

```{r, message = F}
# Expand invitro genotypes across read subsample names
invitro_df <- invitro_import()
invitro_reads_df <- subsample_reads_df %>% 
  select(sample) %>% 
  distinct() %>% 
  separate(sample, into = c("sample", "subsample"), sep = "_") %>% 
  left_join(invitro_df, by = "sample") %>% 
  unite(sample, sample, subsample, sep = "_") %>% 
  format_hla_table() %>% 
  drop_na()
invitro_reads_df
```

```{r}
# Import read statistics
subsample_reads_arcas_log_dir <- sprintf("%s/arcasHLA", subsample_reads_path)
subsample_reads_alignment_stats_df <- hla_mapping_stats_import(subsample_reads_samples, subsample_reads_arcas_log_dir)
```

### Calculate accuracy

```{r}
# Calculate accuracy
subsample_reads_accuracy_df <- compare_hla(
  hla_df = bind_rows(
    subsample_reads_df,
    invitro_reads_df
  ),
  reference = "invitro", 
  method = "accuracy"
)
# saveRDS(subsample_reads_accuracy_df, "isb_subsample_reads_accuracy.RDS")
```


```{r}
# Calculate success
subsample_reads_success_df <- compare_hla(
  hla_df = bind_rows(
    subsample_reads_df,
    invitro_reads_df
  ),
  reference = "invitro", 
  method = "success"
)
# saveRDS(subsample_reads_success_df, "isb_subsample_reads_success.RDS")
```

```{r}
# Combine various data
subsample_reads_combined_df <- subsample_reads_accuracy_df %>% 
  left_join(
    subsample_reads_success_df %>% select(sample:field, genotyper, success = accuracy),
    by = c("sample", "locus", "field", "genotyper")
  )%>% 
  left_join(subsample_reads_alignment_stats_df, by = c("sample", "locus")) %>% 
  left_join(genome_region_reference, by = "locus") %>%
  bind_rows(accuracy_df %>% select(-n_cells)) %>% # No cell numbers for read subsamples
  mutate_at(vars(c(reads, bp)), as.numeric) %>% 
  mutate(coverage = (reads * 91)/bp) %>%  # Calculate coverage
  mutate(n_alleles = map_dbl(allele, function(x) sum(!is.na(x)))) %>%  # Calculate n_alleles
  filter(grepl("^[ABC]|D.[AB]1", locus) & grepl("-(AC|BL)", sample) & genotyper != "hlaminer") 
# saveRDS(subsample_reads_combined_df, "subsample_reads_combined_df.RDS")
# subsample_reads_combined_df <- readRDS("subsample_reads_combined_df.RDS")
```

### Reads plots
```{r, fig.width=12, fig.height=2.5, warning = F, message = F}
gg_coverage(subsample_reads_combined_df, x_var = "reads", y_var = "accuracy") 
plt_read_accuracy_all <- gg_coverage(subsample_reads_combined_df, x_var = "reads", y_var = "accuracy") 
plt_read_accuracy_abc <- gg_coverage(subsample_reads_combined_df %>% filter(grepl("^[ABC]", locus)), x_var = "reads", y_var = "accuracy") 
```
```{r, fig.width=12, fig.height=2.5, warning = F, message = F}
gg_coverage(subsample_reads_combined_df, x_var = "reads", y_var = "success")
plt_read_success_all <- gg_coverage(subsample_reads_combined_df, x_var = "reads", y_var = "success")
plt_read_success_abc <- gg_coverage(subsample_reads_combined_df %>% filter(grepl("^[ABC]", locus)), x_var = "reads", y_var = "success") 
```
```{r, fig.width=12, fig.height=3, warning = F, message = F}
gg_coverage(subsample_reads_combined_df, x_var = "reads", y_var = "n_alleles", facet_genotyper = T)
plt_read_allele_all <- gg_coverage(subsample_reads_combined_df, x_var = "reads", y_var = "n_alleles", facet_genotyper = T)
plt_read_allele_abc <- gg_coverage(subsample_reads_combined_df %>% filter(grepl("^[ABC]", locus)), x_var = "reads", y_var = "n_alleles", facet_genotyper = T)
```
- Default arcasHLA cut off 75 reads (log10 = 1.8)

```{r, fig.width=12, fig.height=3, warning = F, message = F}
subsample_reads_combined_df %>% 
  filter(field == "field_2") %>% 
  mutate(n_alleles = factor(n_alleles, levels = 0:2)) %>% 
  ggplot(aes(y = n_alleles, x = log10(coverage)))+
  # geom_violin(scale = "count", alpha = 0.5, )+
  geom_jitter(size = 0.2, width = 0, height =0.2, alpha = 0.2)+
  # geom_boxplot(width=0.1)+
  # ggbeeswarm::geom_beeswarm(size = 0.1, alpha = 0.1)+
  # coord_flip()+
  facet_grid(genotyper ~ locus, scales = "free")+
  theme_bw()
```

# Subsample CELLS

```{r}
# Import predicted genotypes
subsample_cells_path <- "/labs/khatrilab/solomonb/covid/isb/subsample_cells"
subsample_cells_samples <- tibble(sample = list.files(sprintf("%s/arcasHLA", subsample_cells_path))) %>% 
  filter(grepl("^INCOV", sample)) %>% 
  separate(sample, into = c("sample"), sep = "\\.", extra = "drop") %>% 
  distinct() %>% 
  pull(sample)
subsample_cells_df <- format_hla_table(combine_HLA_import(path = subsample_cells_path, samples = subsample_cells_samples))
subsample_cells_df <- subsample_cells_df %>% 
  filter(grepl("^[ABC]|D.[AB]1", locus) & grepl("-(AC|BL)_", sample))
subsample_cells_df
```


```{r, message = F}
# Expand invitro genotypes across read subsample names
invitro_df <- invitro_import()
invitro_cells_df <- subsample_cells_df %>% 
  select(sample) %>% 
  distinct() %>% 
  separate(sample, into = c("sample", "subsample"), sep = "_") %>% 
  left_join(invitro_df, by = "sample") %>% 
  unite(sample, sample, subsample, sep = "_") %>% 
  format_hla_table() %>% 
  drop_na()
invitro_cells_df
```

```{r}
# Import read statistics
subsample_cells_arcas_log_dir <- sprintf("%s/arcasHLA", subsample_cells_path)
subsample_cells_alignment_stats_df <- hla_mapping_stats_import(subsample_cells_samples, subsample_cells_arcas_log_dir)
```

```{r}
# Import cell counts for each subsample
subsample_cell_counts_df <- tibble(line = system("wc -l /labs/khatrilab/solomonb/covid/isb/subsample_cells/barcodes/INCOV*", intern = T)) %>% 
  separate(line, into = c("n_cells", "file"), sep = " /") %>% 
  drop_na() %>% 
  mutate(sample = gsub("\\..*","",basename(file))) %>% 
  mutate(n_cells = as.numeric(n_cells)) %>% 
  select(-file) %>% 
  # separate(sample, into = c("sample", "subset"), sep = "_") %>% 
  bind_rows(cell_counts_df)
subsample_cell_counts_df
```

### Calculate accuracy

```{r}
# Calculate accuracy
subsample_cells_accuracy_df <- compare_hla(
  hla_df = bind_rows(
    subsample_cells_df,
    invitro_cells_df
  ),
  reference = "invitro", 
  method = "accuracy"
)
# saveRDS(subsample_cells_accuracy_df, "isb_subsample_cells_accuracy.RDS")
# subsample_cells_accuracy_df <- readRDS("isb_subsample_cells_accuracy.RDS")
```

```{r}
# Calculate success
subsample_cells_success_df <- compare_hla(
  hla_df = bind_rows(
    subsample_cells_df,
    invitro_cells_df
  ),
  reference = "invitro", 
  method = "success"
)
# saveRDS(subsample_cells_success_df, "isb_subsample_cells_success.RDS")
subsample_cells_success_df <- readRDS("isb_subsample_cells_success.RDS")
```

```{r}
# Combine various data
subsample_cells_combined_df <- subsample_cells_accuracy_df %>% 
  left_join(
    subsample_cells_success_df %>% select(sample:field, genotyper, success = accuracy),
    by = c("sample", "locus", "field", "genotyper")
  )%>% 
  left_join(subsample_cells_alignment_stats_df, by = c("sample", "locus")) %>% 
  left_join(genome_region_reference, by = "locus") %>%
  left_join(subsample_cell_counts_df, by = "sample") %>%
  bind_rows(accuracy_df) %>% 
  mutate_at(vars(c(reads, bp)), as.numeric) %>% 
  mutate(coverage = (reads * 91)/bp) %>% 
  mutate(n_alleles = map_dbl(allele, function(x) sum(!is.na(x)))) %>% 
  filter(grepl("^[ABC]|D.[AB]1", locus) & grepl("-(AC|BL)", sample) & genotyper != "hlaminer") 
saveRDS(subsample_cells_combined_df, "subsample_cells_combined_df.RDS")
```

### Coverage plots

```{r, fig.width=12, fig.height=2.5, warning = F, message = F}
gg_coverage(subsample_cells_combined_df, x_var = "coverage", y_var = "accuracy") 
```
```{r, fig.width=12, fig.height=2.5, warning = F, message = F}
gg_coverage(subsample_cells_combined_df, x_var = "coverage", y_var = "success")
```
```{r, fig.width=12, fig.height=3, warning = F, message = F}
gg_coverage(subsample_cells_combined_df, x_var = "coverage", y_var = "n_alleles", facet_genotyper = T)
```

### Cell number plots

```{r, fig.width=12, fig.height=2.5, warning = F, message = F}
gg_coverage(subsample_cells_combined_df, x_var = "n_cells", y_var = "accuracy") 
plt_cell_accuracy_all <- gg_coverage(subsample_cells_combined_df, x_var = "n_cells", y_var = "accuracy") 
plt_cell_accuracy_abc <- gg_coverage(subsample_cells_combined_df %>% filter(grepl("^[ABC]", locus)), x_var = "n_cells", y_var = "accuracy") 
```
```{r, fig.width=12, fig.height=2.5, warning = F, message = F}
gg_coverage(subsample_cells_combined_df, x_var = "n_cells", y_var = "success") 
plt_cell_success_all <- gg_coverage(subsample_cells_combined_df, x_var = "n_cells", y_var = "success") 
plt_cell_success_abc <- gg_coverage(subsample_cells_combined_df %>% filter(grepl("^[ABC]", locus)), x_var = "n_cells", y_var = "success") 
```
```{r, fig.width=12, fig.height=3, warning = F, message = F}
gg_coverage(subsample_cells_combined_df, x_var = "n_cells", y_var = "n_alleles", facet_genotyper = T)
plt_cell_allele_all <- gg_coverage(subsample_cells_combined_df, x_var = "n_cells", y_var = "n_alleles", facet_genotyper = T)
plt_cell_allele_abc <- gg_coverage(subsample_cells_combined_df %>% filter(grepl("^[ABC]", locus)), x_var = "n_cells", y_var = "n_alleles", facet_genotyper = T)
```


```{r, fig.width=12, fig.height=3, warning = F, message = F}
# subsample_cells_combined_df %>% 
#   mutate(reads = reads/n_cells) %>% 
#   filter(reads < 100) %>% 
#   gg_coverage(x_var = "reads", y_var = "accuracy", x_log = F) 
```








# Combined plots
```{r, fig.width = 9, fig.height = 7, message = F, warning = F}
no_leg <- list(theme(legend.position = "none"))
legend <- cowplot::get_legend(plt_read_accuracy_abc)
plot_grid(
  plot_grid(
    plt_read_accuracy_abc + no_leg,
    plt_read_success_abc + no_leg,
    plt_read_allele_abc,
    ncol = 1,
    align = "v",
    axis = "lr",
    rel_heights = c(1,1,1.2),
    labels = LETTERS[1:3]
  ),
  plot_grid(
    plt_cell_accuracy_abc + no_leg,
    plt_cell_success_abc + no_leg,
    plt_cell_allele_abc,
    ncol = 1,
    align = "v",
    axis = "lr",
    rel_heights = c(1,1,1.2),
    labels = LETTERS[4:6]
  ),
  plot_grid(
    legend,
    legend,
    NA,
    ncol = 1
  ),
  nrow = 1,
  rel_widths = c(1,1,0.25)
)
```

```{r, fig.width = 18, fig.height = 7, message = F, warning = F}
no_leg <- list(theme(legend.position = "none"))
legend <- cowplot::get_legend(plt_read_accuracy_all)
plot_grid(
  plot_grid(
    plt_read_accuracy_all + no_leg,
    plt_read_success_all + no_leg,
    plt_read_allele_all,
    ncol = 1,
    align = "v",
    axis = "lr",
    rel_heights = c(1,1,1.2),
    labels = LETTERS[1:3]
  ),
  plot_grid(
    plt_cell_accuracy_all + no_leg,
    plt_cell_success_all + no_leg,
    plt_cell_allele_all,
    ncol = 1,
    align = "v",
    axis = "lr",
    rel_heights = c(1,1,1.2),
    labels = LETTERS[4:6]
  ),
  plot_grid(
    legend,
    legend,
    NA,
    ncol = 1
  ),
  nrow = 1,
  rel_widths = c(1,1,0.25)
)
```

### Main plot

```{r}
loci <- c("A", "B", "C", "DPB1", "DQA1", "DRB1")
plt_read_accuracy_subset <- gg_coverage(
  subsample_reads_combined_df %>% 
    filter(locus %in% loci) %>% 
    filter(!(genotyper == "phlat" & locus == "DPB1")) %>% 
    filter(!(genotyper == "optitype" & locus == "DPB1")) %>% 
    filter(!(genotyper == "optitype" & locus == "DQA1")) %>% 
    filter(!(genotyper == "optitype" & locus == "DRB1")), 
  x_var = "reads", y_var = "accuracy") +
  facet_wrap(~locus, nrow = 2)
plt_read_accuracy_subset
```
```{r}
save_plot("figures_pdf/v2/5_coverage.pdf", plt_read_accuracy_subset, base_height = 4, base_width = 6)
```

```{r}
plt_read_allele_all <- gg_coverage(subsample_reads_combined_df %>% 
    filter(locus %in% loci) %>% 
    filter(!(genotyper == "phlat" & locus == "DPB1")) %>% 
    filter(!(genotyper == "optitype" & locus == "DPB1")) %>% 
    filter(!(genotyper == "optitype" & locus == "DQA1")) %>% 
    filter(!(genotyper == "optitype" & locus == "DRB1")), x_var = "reads", y_var = "n_alleles", facet_genotyper = T)
plt_read_allele_all
```
```{r, fig.width = 7, fig.height=7}
plt_main <- plot_grid(
  plt_read_accuracy_subset,
  plt_read_allele_all,
  ncol = 1,
  rel_heights = c(3,2),
  align = "v",
  axis = "lr",
  labels = c("A", "B")
)
plt_main
```
```{r}
save_plot("figures_pdf/v2/5_coverage.pdf", plt_main, base_height = 7, base_width = 7)
```

# Slope stats

```{r}
get_lm_stats <- function(df, x_var = "coverage", y_var = "accuracy", field_var = "field_2", x_log = T){
  # Input checks
  arg_col <- makeAssertCollection()
  assertChoice(y_var, c("accuracy", "success", "n_alleles"), add = arg_col)
  assertChoice(x_var, c("reads", "coverage", "n_cells"), add = arg_col)
  assertChoice(field_var, c("field_1", "field_2", "field_3"), add = arg_col)
  assertLogical(x_log, add = arg_col)
  if (arg_col$isEmpty()==F) {map(arg_col$getMessages(),print);reportAssertions(arg_col)}
  
  # browser()
  df <- df %>% 
    filter(field == field_var) %>% 
    exclude_genotyper_fields() # Replace w/ exclude_genotyper_fields?
  if (x_log ==T){df <- df %>% mutate_at(x_var, log10)}
  df %>%
  filter_at(x_var, function(x) !is.na(x) & !is.infinite(x)) %>% 
  group_by(locus, field, genotyper) %>%
  nest() %>%
  mutate(data = map(data, function(x){
    summary(lm(!!sym(y_var) ~ !!sym(x_var), data = x)) %>% broom::tidy() %>% filter(term == x_var)
  })) %>% unnest_wider(data) %>%
  mutate(est_min = round(estimate - 2*std.error, 2),
         est_max = round(estimate + 2*std.error, 2))
}

model_stats <- get_lm_stats(subsample_cells_combined_df, x_var = "reads", y_var = "accuracy", x_log = T)
model_stats
```

```{r}
model_stats <- bind_rows(
  get_lm_stats(subsample_reads_combined_df, x_var = "reads", y_var = "accuracy", x_log = T) %>% mutate(x_var = "reads", y_var = "accuracy"),
  get_lm_stats(subsample_reads_combined_df, x_var = "reads", y_var = "success", x_log = T) %>% mutate(x_var = "reads", y_var = "success"),
  get_lm_stats(subsample_cells_combined_df, x_var = "n_cells", y_var = "accuracy", x_log = T) %>% mutate(x_var = "cells", y_var = "accuracy"),
  get_lm_stats(subsample_cells_combined_df, x_var = "n_cells", y_var = "success", x_log = T) %>% mutate(x_var = "cells", y_var = "success")
)  %>% 
  # filter(grepl("^[ABC]", locus)) %>% 
  ungroup() %>% 
  filter(field == "field_2") 
```



# Table
```{r}
col_vars <- c("x_var", "locus")
row_vars <- c("y_var", "genotyper")

flex_df <- model_stats %>% 
  # filter(grepl("^[ABC]", locus)) %>% 
  ungroup() %>% 
  filter(field == "field_2") %>% 
  mutate(CI = sprintf("(%s) - (%s)", est_min, est_max)) %>% 
  mutate(genotyper = reformat_hla_genotyper(genotyper)) %>% 
  select(genotyper, locus, CI, x_var, y_var) %>% 
  pivot_wider(names_from = col_vars, values_from = "CI", names_sep =  "-") 

df_key <- tibble(col_keys = names(flex_df)) %>% 
  filter(!(col_keys %in% c(col_vars, row_vars))) %>% 
  separate(col_keys, into = col_vars, sep = "-", remove = F) %>%
  mutate_at(vars(contains("genotyper")), reformat_hla_genotyper) %>% 
  arrange_at(col_vars) %>% 
  mutate_all(as.character) %>% 
  mutate_at(col_vars[!(col_vars %in% "locus")], str_to_sentence)

flex_df %>% 
  select(row_vars, everything()) %>% 
  mutate_at(row_vars[!(row_vars %in% "genotyper")], str_to_sentence) %>% 
  # select(locus, df_key$col_keys) %>% 
  flextable() %>% 
  colformat_char(na_str = "---") %>%
  merge_v(j=1:2) %>% 
  set_header_df(mapping = df_key, key = "col_keys") %>% 
  merge_h(part = "header") %>%
  theme_vanilla() %>% 
  # vline(j=vlines_sequence, border = fp_border_default()) %>%
  fix_border_issues() %>% 
  autofit() %>% 
  fontsize(size = 8, part = "all") %>% 
  print(preview = "pptx")
```


```{r}
  df <- df %>% 
    mutate(field = reformat_hla_field(field)) %>% 
    mutate_at(vars(contains("genotyper")), reformat_hla_genotyper) %>% 
    mutate(cell_value = ifelse(!is.na(se), 
                               sprintf("%s %s %s", round(mean_accuracy,2),"\u00B1",round(se,2)),
                               sprintf("%s", round(mean_accuracy,2)))) %>%
    mutate(cell_value = ifelse(grepl("NA", cell_value), NA, cell_value)) %>% 
    select(-sd,-se, -mean_accuracy) %>% 
    pivot_wider(names_from = nesting_vars, values_from = "cell_value", names_sep = "-") 
  
  df_key <- suppressWarnings({tibble(col_keys = names(df)) %>% 
      filter(!grepl("locus", col_keys)) %>% 
      separate(col_keys, into = nesting_vars, sep = "-", remove = F) %>%
      mutate_at(vars(contains("genotyper")), reformat_hla_genotyper) %>% 
      arrange_at(nesting_vars) %>% 
      mutate_all(as.character)
  })
  
  vline_var <- tail(nesting_vars,1)
  vlines_sequence <- seq(1,nrow(df_key),by = length(unique(df_key[[vline_var]])))
  df <- df %>% 
    select(locus, df_key$col_keys) %>% 
    flextable() %>% 
    colformat_char(na_str = "---") %>% 
    set_header_df(mapping = df_key, key = "col_keys") %>% 
    merge_h(part = "header") %>% 
    theme_vanilla() %>% 
    vline(j=vlines_sequence, border = fp_border_default()) %>%
    fix_border_issues()
  return(df)
```





# Scratch work

```{r, fig.width=12, fig.height=3, warning = F, message = F}
subsample_cells_combined_df %>% 
  filter(field == "field_2") %>% 
  mutate(n_alleles = factor(n_alleles, levels = 0:2)) %>% 
  ggplot(aes(x = n_alleles, y = log10(n_cells)))+
  geom_violin(scale = "width")+
  geom_jitter(size = 0.2, height = 0, width =0.05, alpha = 0.2)+
  # geom_boxplot(width=0.1)+
  # ggbeeswarm::geom_beeswarm(size = 0.1, alpha = 0.1)+
  coord_flip()+
  facet_grid(genotyper ~ locus, scales = "free")+
  theme_bw()
```
# Coverage stats
```{r, fig.width=12, fig.height=3, warning = F, message = F}
subsample_cells_combined_df %>% 
  ggplot(aes(x = n_cells, y = coverage, color = genotyper)) + 
  geom_point() +
  facet_wrap(genotyper ~ locus, scales = "free", nrow = 3) + 
  theme_bw() +
  theme(strip.text = element_blank(),
        axis.text.x = element_text(angle = 45, hjust = 1))+
  scale_color_brewer(palette = "Dark2")
```

```{r, fig.width=16, fig.height=3, warning = F, message = F}
subsample_cells_combined_df %>% 
  select(-genotyper) %>% distinct() %>% 
  mutate(cov_per_cell = coverage/n_cells) %>% 
  ggplot(aes(x = log10(n_cells), y = cov_per_cell)) + 
  geom_point(color = "black", alpha = 0.1) +
  stat_smooth(method = "lm")+
  # stat_smooth()+
    facet_wrap( ~ locus, scales = "free", nrow = 1) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))+
  scale_color_brewer(palette = "Dark2")
```

```{r, fig.width=16, fig.height=6, warning = F, message = F}
subsample_cells_combined_df %>% 
  select(-genotyper) %>% distinct() %>% 
  filter(!is.na(n_cells)) %>% 
  mutate(cov_per_cell = coverage/n_cells) %>% 
  mutate(n_cells = cut(n_cells, seq(0,10000,by=2500), right = F)) %>% 
  ggplot(aes(x = n_cells, y = cov_per_cell)) + 
  stat_summary(fun = function(x) mean(x,na.rm=T), geom = "bar") +
  # geom_point(color = "black", alpha = 0.1) +
  # stat_smooth(method = "lm")+
  # stat_smooth()+
  facet_wrap( ~ locus, scales = "free", nrow = 1) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))+
  scale_color_brewer(palette = "Dark2")
```

```{r}
hist(cut(1:100, seq(1,100,10)))
```

```{r}
subsample_cells_combined_df %>% 
  ungroup() %>% 
  count(genotyper, locus, field)
```
```{r}
subsample_reads_combined_df %>% 
  ungroup() %>% 
  count(genotyper, locus, field)
```
